/*******************************************************************************
* Copyright 2019-2020 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "cpu/x64/jit_generator.hpp"

#include "cpu/x64/gemm/f32/common_f32.hpp"

namespace dnnl {
namespace impl {
namespace cpu {
namespace x64 {

// Constructs the kernel generator object. The jit_generator base is
// initialised with nullptr and F32_COMPUTE_KERNEL_CODE_SIZE; nothing else
// happens here -- the instruction stream is described in generate().
// NOTE(review): the two base arguments are presumably "no caller-provided
// code buffer" and "code buffer capacity" -- confirm against jit_generator.
jit_sse41_kernel_b0_sgemm_kern::jit_sse41_kernel_b0_sgemm_kern()
    : jit_generator(nullptr, F32_COMPUTE_KERNEL_CODE_SIZE) {
    // Intentionally empty: all state lives in the base class.
}

void jit_sse41_kernel_b0_sgemm_kern::generate() {

#ifndef _WIN32

// Non-Windows build: symbolic names for the general-purpose registers used
// by the instruction stream emitted below.
//
// M, N and K initially hold pointers: the emitted code dereferences each of
// them as a qword right after the preamble and keeps the loaded value in the
// same register. A and B are used as base addresses for the vector loads
// (both are biased by +128 once, hence the negative displacements later on).
// NOTE(review): rdi/rsi/rdx/r8/r9 look like incoming argument registers of
// the System V AMD64 convention -- confirm against the kernel's declared
// signature, which is not visible in this file.
#define M rdi
#define N rsi
#define K rdx
#define A r8
#define B r9
// C and LDC are not received in these registers; they are loaded from the
// stack slots OLD_C / OLD_LDC after the preamble. LDC is then shifted left
// by 2 before being used as an address stride.
#define C rcx
#define LDC r10

// Working registers of the generated loops:
//   AA  - address computed as A + K * 0x20, advanced by 8 per unrolled
//         iteration and used only as a prefetcht0 target in the visible code
//   I   - counter initialised from N, decremented by 4 per pass of the
//         4-wide loop; its low bits select the 2-wide and 1-wide tails
//   J   - counter initialised from M, decremented by 8 per outer pass
//   H   - inner-loop trip counter derived from K (K >> 2, then K & 3)
//   AO  - running pointer that starts at A
//   BO  - running pointer that starts at B
//   CO1 - store pointer that starts at C
//   CO2 - CO1 + LDC * 2
#define AA r15
#define I r11
#define J r12
#define H rax
#define AO rbx
#define BO rbp
#define CO1 r13
#define CO2 r14

// Addresses of the stack slots that C and LDC are read from. `stacksize` is
// the local assigned from get_size_of_abi_save_regs() after preamble(), so
// these macros are only usable once that local is in scope.
#define OLD_C (8 + stacksize + rsp)
#define OLD_LDC (16 + stacksize + rsp)

#else

// Windows build: symbolic names for the general-purpose registers used by
// the instruction stream emitted below.
//
// M, N and K initially hold pointers: the emitted code dereferences each of
// them as a qword right after the preamble and keeps the loaded value in the
// same register.
// NOTE(review): rcx/rdx/r8 look like incoming argument registers of the
// Microsoft x64 convention -- confirm against the kernel's declared
// signature, which is not visible in this file.
#define M rcx
#define N rdx
#define K r8
// On this path A and B are loaded from the stack slots OLD_A / OLD_B after
// the preamble, and are then used as base addresses for the vector loads.
#define A rdi
#define B rsi
// C and LDC are likewise loaded from the stack slots OLD_C / OLD_LDC. LDC is
// then shifted left by 2 before being used as an address stride.
#define C r9
#define LDC r10
// Working registers of the generated loops:
//   AA  - address computed as A + K * 0x20, advanced by 8 per unrolled
//         iteration and used only as a prefetcht0 target in the visible code
//   I   - counter initialised from N, decremented by 4 per pass of the
//         4-wide loop; its low bits select the 2-wide and 1-wide tails
//   J   - counter initialised from M, decremented by 8 per outer pass
//   H   - inner-loop trip counter derived from K (K >> 2, then K & 3)
//   AO  - running pointer that starts at A
//   BO  - running pointer that starts at B
//   CO1 - store pointer that starts at C
//   CO2 - CO1 + LDC * 2
#define AA r15
#define I r11
#define J r12
#define H rax
#define AO rbx
#define BO rbp
#define CO1 r13
#define CO2 r14

// Addresses of the stack slots that A, B, C and LDC are read from on the
// Windows path. `stacksize` is the local assigned from
// get_size_of_abi_save_regs() after preamble(), so these macros are only
// usable once that local is in scope.
// Each expansion is fully parenthesised (matching the non-Windows OLD_C /
// OLD_LDC definitions) so that it stays a single operand regardless of the
// operators surrounding the use site.
#define OLD_A (40 + stacksize + rsp)
#define OLD_B (48 + stacksize + rsp)
#define OLD_C (56 + stacksize + rsp)
#define OLD_LDC (64 + stacksize + rsp)

#endif

    inLocalLabel();
    {

        Xbyak::Label l1090;
        Xbyak::Label l10ac;
        Xbyak::Label l11a4;
        Xbyak::Label l11b0;
        Xbyak::Label l11fc;
        Xbyak::Label l1264;
        Xbyak::Label l12c4;
        Xbyak::Label l13c0;
        Xbyak::Label l13d0;
        Xbyak::Label l14cc;
        Xbyak::Label l14d8;
        Xbyak::Label l1528;
        Xbyak::Label l1568;
        Xbyak::Label l15c8;
        Xbyak::Label l16c4;
        Xbyak::Label l16d0;
        Xbyak::Label l17cc;
        Xbyak::Label l17d8;
        Xbyak::Label l1828;
        Xbyak::Label l1860;
        Xbyak::Label l1864;
        Xbyak::Label l1894;
        Xbyak::Label l18e8;
        Xbyak::Label l19e4;
        Xbyak::Label l1a00;
        Xbyak::Label l1afc;
        Xbyak::Label l1b08;
        Xbyak::Label l1b58;
        Xbyak::Label l1bc0;
        Xbyak::Label l1c24;
        Xbyak::Label l1d24;
        Xbyak::Label l1d34;
        Xbyak::Label l1e34;
        Xbyak::Label l1e40;
        Xbyak::Label l1e90;
        Xbyak::Label l1ed0;
        Xbyak::Label l1f34;
        Xbyak::Label l2034;
        Xbyak::Label l2040;
        Xbyak::Label l2140;
        Xbyak::Label l214c;
        Xbyak::Label l219c;
        Xbyak::Label l21d4;
        Xbyak::Label l21d8;
        Xbyak::Label l2208;
        Xbyak::Label l225c;
        Xbyak::Label l2358;
        Xbyak::Label l2370;
        Xbyak::Label l246c;
        Xbyak::Label l2478;
        Xbyak::Label l24c8;
        Xbyak::Label l2534;
        Xbyak::Label l2598;
        Xbyak::Label l2698;
        Xbyak::Label l26a8;
        Xbyak::Label l27a8;
        Xbyak::Label l27b4;
        Xbyak::Label l27c;
        Xbyak::Label l2804;
        Xbyak::Label l2844;
        Xbyak::Label l28a8;
        Xbyak::Label l298;
        Xbyak::Label l29a8;
        Xbyak::Label l29b4;
        Xbyak::Label l2ab4;
        Xbyak::Label l2ac0;
        Xbyak::Label l2b10;
        Xbyak::Label l2b4c;
        Xbyak::Label l2b50;
        Xbyak::Label l444;
        Xbyak::Label l450;
        Xbyak::Label l4bc;
        Xbyak::Label l50;
        Xbyak::Label l578;
        Xbyak::Label l5e4;
        Xbyak::Label l74;
        Xbyak::Label l794;
        Xbyak::Label l7a4;
        Xbyak::Label l954;
        Xbyak::Label l960;
        Xbyak::Label l9d0;
        Xbyak::Label la44;
        Xbyak::Label lab0;
        Xbyak::Label lc60;
        Xbyak::Label lc6c;
        Xbyak::Label ld0;
        Xbyak::Label le1c;
        Xbyak::Label le28;
        Xbyak::Label le98;
        Xbyak::Label lf00;
        Xbyak::Label lf14;
        Xbyak::Label lf44;
        Xbyak::Label lf98;

        preamble();
        auto stacksize = get_size_of_abi_save_regs();
#ifdef _WIN32
        mov(A, ptr[OLD_A]);
        mov(B, ptr[OLD_B]);
#endif
        mov(C, ptr[OLD_C]);
        mov(LDC, ptr[OLD_LDC]);

        mov(M, qword[M]);
        mov(N, qword[N]);
        mov(K, qword[K]);
        shl(LDC, 0x2);
        sub(A, -128);
        sub(B, -128);
        mov(J, M);
        cmp(J, 0x8);
        jl(lf14, T_NEAR);
        align(4);

        L(l50);
        mov(AA, K);
        imul(AA, AA, 0x20);
        add(AA, A);
        mov(CO1, C);
        add(C, 0x20);
        mov(BO, B);
        mov(I, N);
        cmp(I, 0x4);
        jl(l578, T_NEAR);
        align(4);

        L(l74);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        movups(xmm0, xword[A - 0x80]);
        xorps(xmm8, xmm8);
        movups(xmm1, xword[A - 0x70]);
        xorps(xmm9, xmm9);
        movups(xmm2, xword[A - 0x60]);
        xorps(xmm10, xmm10);
        movups(xmm3, xword[A - 0x50]);
        xorps(xmm11, xmm11);
        movaps(xmm4, xword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movaps(xmm5, xword[BO - 0x70]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(AO, A);
        mov(H, K);
        sar(H, 0x2);
        jle(l444, T_NEAR);
        sub(H, 0x1e);
        jle(l27c, T_NEAR);
        align(4);

        L(ld0);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x60]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO - 0x30]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x50]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO - 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        prefetcht0(byte[AO + 0x1c0]);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x40]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO + 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x30]);
        addps(xmm14, xmm7);
        add(AA, 0x8);
        sub(BO, -64);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO + 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO + 0x30]);
        sub(AO, -128);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(ld0, T_NEAR);
        align(4);

        L(l27c);
        prefetcht0(byte[CO1 + 0x1c]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x1c]);
        prefetcht0(byte[CO2 + 0x1c]);
        prefetcht0(byte[CO2 + LDC * 1 + 0x1c]);
        add(H, 0x1e);
        align(4);

        L(l298);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x60]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO - 0x30]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x50]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO - 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        prefetcht0(byte[AO + 0x1c0]);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x40]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO + 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x30]);
        addps(xmm14, xmm7);
        add(AA, 0x8);
        sub(BO, -64);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO + 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO + 0x30]);
        sub(AO, -128);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(l298, T_NEAR);
        align(4);

        L(l444);
        mov(H, K);
        and_(H, 0x3);
        je(l4bc, T_NEAR);
        align(4);

        L(l450);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x70]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x60]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO - 0x50]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        sub(AO, -32);
        sub(BO, -16);
        dec(H);
        jg(l450, T_NEAR);
        align(4);

        L(l4bc);
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movaps(xmm10, xmm1);
        movaps(xmm11, xmm1);
        shufps(xmm10, xmm0, 0xcc);
        shufps(xmm11, xmm0, 0x66);
        movaps(xmm0, xmm12);
        unpcklpd(xmm12, xmm13);
        unpckhpd(xmm0, xmm13);
        movaps(xmm1, xmm14);
        unpckhpd(xmm14, xmm15);
        unpcklpd(xmm1, xmm15);
        movaps(xmm13, xmm12);
        shufps(xmm12, xmm14, 0xcc);
        shufps(xmm13, xmm14, 0x66);
        movaps(xmm14, xmm1);
        movaps(xmm15, xmm1);
        shufps(xmm14, xmm0, 0xcc);
        shufps(xmm15, xmm0, 0x66);
        movups(xword[CO1 + 0x0], xmm8);
        movups(xword[CO1 + 0x10], xmm12);
        movups(xword[CO1 + LDC * 1 + 0x0], xmm9);
        movups(xword[CO1 + LDC * 1 + 0x10], xmm13);
        movups(xword[CO2], xmm10);
        movups(xword[CO2 + 0x10], xmm14);
        movups(xword[CO2 + LDC * 1], xmm11);
        movups(xword[CO2 + LDC * 1 + 0x10], xmm15);
        lea(CO1, ptr[CO1 + LDC * 4 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 4]);
        sub(I, 0x4);
        cmp(I, 0x4);
        jge(l74, T_NEAR);
        align(4);

        L(l578);
        test(I, 0x2);
        jle(la44, T_NEAR);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        movups(xmm0, xword[A - 0x80]);
        xorps(xmm8, xmm8);
        movups(xmm1, xword[A - 0x70]);
        xorps(xmm9, xmm9);
        movups(xmm2, xword[A - 0x60]);
        xorps(xmm10, xmm10);
        movups(xmm3, xword[A - 0x50]);
        xorps(xmm11, xmm11);
        movddup(xmm4, qword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movddup(xmm5, qword[BO - 0x78]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(AO, A);
        mov(H, K);
        sar(H, 0x2);
        jle(l954, T_NEAR);
        sub(H, 0x1e);
        jle(l794, T_NEAR);
        align(4);

        L(l5e4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x70]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO - 0x30]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x68]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO - 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        prefetcht0(byte[AO + 0x1c0]);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x60]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO + 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x58]);
        addps(xmm14, xmm7);
        add(AA, 0x8);
        sub(BO, -32);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO + 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO + 0x30]);
        sub(AO, -128);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(l5e4, T_NEAR);
        align(4);

        L(l794);
        prefetcht0(byte[CO1 + 0x1c]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x1c]);
        add(H, 0x1e);
        align(4);

        L(l7a4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x70]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO - 0x30]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x68]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO - 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        prefetcht0(byte[AO + 0x1c0]);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x60]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO + 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x58]);
        addps(xmm14, xmm7);
        add(AA, 0x8);
        sub(BO, -32);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO + 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO + 0x30]);
        sub(AO, -128);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(l7a4, T_NEAR);
        align(4);

        L(l954);
        mov(H, K);
        and_(H, 0x3);
        je(l9d0, T_NEAR);
        align(4);

        L(l960);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x78]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x60]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO - 0x50]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        sub(AO, -32);
        sub(BO, -8);
        dec(H);
        jg(l960, T_NEAR);
        align(4);

        L(l9d0);
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movaps(xmm0, xmm12);
        unpcklpd(xmm12, xmm13);
        unpckhpd(xmm0, xmm13);
        movaps(xmm1, xmm14);
        unpckhpd(xmm14, xmm15);
        unpcklpd(xmm1, xmm15);
        movaps(xmm13, xmm12);
        shufps(xmm12, xmm14, 0xcc);
        shufps(xmm13, xmm14, 0x66);
        movups(xword[CO1 + 0x0], xmm8);
        movups(xword[CO1 + 0x10], xmm12);
        movups(xword[CO1 + LDC * 1 + 0x0], xmm9);
        movups(xword[CO1 + LDC * 1 + 0x10], xmm13);
        lea(CO1, ptr[CO1 + LDC * 2 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 2]);
        align(4);

        L(la44);
        test(I, 0x1);
        jle(lf00, T_NEAR);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        movups(xmm0, xword[A - 0x80]);
        xorps(xmm8, xmm8);
        movups(xmm1, xword[A - 0x70]);
        xorps(xmm9, xmm9);
        movups(xmm2, xword[A - 0x60]);
        xorps(xmm10, xmm10);
        movups(xmm3, xword[A - 0x50]);
        xorps(xmm11, xmm11);
        movss(xmm4, dword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movss(xmm5, dword[BO - 0x7c]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(AO, A);
        mov(H, K);
        sar(H, 0x2);
        jle(le1c, T_NEAR);
        sub(H, 0x1e);
        jle(lc60, T_NEAR);
        align(4);

        L(lab0);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x78]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO - 0x30]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x74]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO - 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        prefetcht0(byte[AO + 0x1c0]);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x70]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO + 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x6c]);
        addps(xmm14, xmm7);
        add(AA, 0x8);
        sub(BO, -16);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO + 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO + 0x30]);
        sub(AO, -128);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(lab0, T_NEAR);
        align(4);

        L(lc60);
        prefetcht0(byte[CO1 + 0x1c]);
        add(H, 0x1e);
        align(4);

        L(lc6c);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x78]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO - 0x30]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x74]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO - 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        prefetcht0(byte[AO + 0x1c0]);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x70]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO + 0x10]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm8, xmm5);
        addps(xmm12, xmm7);
        pshufd(xmm5, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm5, 0xb1);
        movaps(xmm7, xmm5);
        mulps(xmm5, xmm2);
        mulps(xmm7, xmm3);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x6c]);
        addps(xmm14, xmm7);
        add(AA, 0x8);
        sub(BO, -16);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO + 0x20]);
        mulps(xmm7, xmm3);
        movups(xmm3, xword[AO + 0x30]);
        sub(AO, -128);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(lc6c, T_NEAR);
        align(4);

        // K remainder: H = K & 3 single k-steps (presumably those left over by
        // the unrolled loop above); skip straight to the store when there are none.
        L(le1c);
        mov(H, K);
        and_(H, 0x3);
        je(le98, T_NEAR);
        align(4);

        // One k-step per iteration. xmm0/xmm1 hold 8 floats of A, xmm4 holds one
        // float of B (movss). Four lane permutations of xmm4 (pshufd 0xb1 swaps
        // neighbours, 0x1b reverses all four lanes) are multiplied by both A
        // halves and accumulated into xmm8..xmm11 (xmm0) and xmm12..xmm15 (xmm1).
        L(le28);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm8, xmm4);
        addps(xmm12, xmm7);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm9, xmm6);
        addps(xmm13, xmm7);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        mulps(xmm7, xmm1);
        addps(xmm10, xmm4);
        // Load B and A for the next k-step before the pointers are advanced.
        movss(xmm4, dword[BO - 0x7c]);
        addps(xmm14, xmm7);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x60]);
        mulps(xmm7, xmm1);
        movups(xmm1, xword[AO - 0x50]);
        addps(xmm11, xmm6);
        addps(xmm15, xmm7);
        // sub with a negative immediate adds: AO += 32 bytes, BO += 4 bytes.
        sub(AO, -32);
        sub(BO, -4);
        dec(H);
        jg(le28, T_NEAR);
        align(4);

        // Epilogue for the 8-float x 1-column case: recombine the permuted
        // accumulators with unpck*/shufps and store 8 consecutive floats
        // (xmm8, xmm12) at CO1. C is only written here, never read.
        // xmm0, xmm1, xmm9 and xmm13 are also produced but not stored by this block.
        L(le98);
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movaps(xmm0, xmm12);
        unpcklpd(xmm12, xmm13);
        unpckhpd(xmm0, xmm13);
        movaps(xmm1, xmm14);
        unpckhpd(xmm14, xmm15);
        unpcklpd(xmm1, xmm15);
        movaps(xmm13, xmm12);
        shufps(xmm12, xmm14, 0xcc);
        shufps(xmm13, xmm14, 0x66);
        movups(xword[CO1 + 0x0], xmm8);
        movups(xword[CO1 + 0x10], xmm12);
        // Step both C pointers by one LDC (added unscaled, so presumably a
        // byte stride -- confirm where LDC is set up).
        lea(CO1, ptr[CO1 + LDC * 1 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 1]);
        align(4);

        // End of one 8-wide pass: A continues from where AO stopped, J drops by 8,
        // and control returns to l50 (defined outside this excerpt) while J >= 8.
        L(lf00);
        mov(A, AO);
        sub(J, 0x8);
        cmp(J, 0x8);
        jge(l50, T_NEAR);
        align(4);

        // 4-wide section: executed only when bit 2 of J is set. `test` clears
        // SF/OF, so the jle below is taken exactly when (J & 4) == 0.
        L(lf14);
        test(J, 0x4);
        jle(l1864, T_NEAR);
        // AA = A + K * 16 bytes; it is only used as a prefetch address in the
        // loops below (presumably the start of the next A panel -- confirm).
        mov(AA, K);
        imul(AA, AA, 0x10);
        add(AA, A);
        // CO1 walks C for this section; C itself moves on by 16 bytes (4 floats).
        mov(CO1, C);
        add(C, 0x10);
        mov(BO, B);
        // I counts the remaining columns of N; fewer than 4 skips the 4-column tile.
        mov(I, N);
        cmp(I, 0x4);
        jl(l1264, T_NEAR);
        align(4);

        // Setup for one 4-float (A) x 4-float (B) tile: CO2 = CO1 + 2 * LDC,
        // accumulators cleared, first two A vectors (xmm0, xmm2) and B vectors
        // (xmm4, xmm5) preloaded. Data is addressed with a -0x80 bias, i.e. the
        // first element is at [ptr - 0x80] (the bias is presumably applied to
        // A/B earlier in the function -- not visible here).
        // xmm12..xmm15 are cleared too, although the 4-wide code in view only
        // accumulates into xmm8..xmm11.
        L(lf44);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        movups(xmm0, xword[A - 0x80]);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        movups(xmm2, xword[A - 0x70]);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        movaps(xmm4, xword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movaps(xmm5, xword[BO - 0x70]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(AO, A);
        // H = K / 4 unrolled iterations; none -> go to the K & 3 tail.
        mov(H, K);
        sar(H, 0x2);
        jle(l11a4, T_NEAR);
        // Hold back the last 30 unrolled iterations for the loop that runs
        // after the C prefetch; if there are 30 or fewer, skip the first loop.
        sub(H, 0x1e);
        jle(l1090, T_NEAR);
        align(4);

        // Main 4x4 loop, unrolled over 4 k-steps (AO and BO each advance 64 bytes
        // per iteration). Each step multiplies one A vector by four lane
        // permutations of one B vector (pshufd 0xb1 swaps neighbours, 0x1b
        // reverses) and accumulates into xmm8..xmm11. Steps alternate between the
        // register pairs (xmm0, xmm4) and (xmm2, xmm5) so the loads for a later
        // step overlap with the arithmetic of the current one.
        L(lf98);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 1 of 4: A in xmm0, B in xmm4.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 2 of 4: A in xmm2, B in xmm5.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x50]);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x50]);
        addps(xmm11, xmm6);
        // k-step 3 of 4: A in xmm0, B in xmm4.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x40]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        addps(xmm11, xmm6);
        // k-step 4 of 4: A in xmm2, B in xmm5; pointer updates are interleaved.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x30]);
        add(AA, 0x8);
        // sub with a negative immediate adds: BO += 64, AO += 64.
        sub(BO, -64);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x30]);
        sub(AO, -64);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(lf98, T_NEAR);
        align(4);

        // Prefetch the four C locations this tile will store to (CO1, CO1+LDC,
        // CO2, CO2+LDC; +0xc is the last float of each 16-byte store), then
        // restore the 30 iterations that the setup code subtracted from H.
        L(l1090);
        prefetcht0(byte[CO1 + 0xc]);
        prefetcht0(byte[CO1 + LDC * 1 + 0xc]);
        prefetcht0(byte[CO2 + 0xc]);
        prefetcht0(byte[CO2 + LDC * 1 + 0xc]);
        add(H, 0x1e);
        align(4);

        // Final unrolled iterations of the 4x4 tile: same body as the main
        // loop at lf98 (4 k-steps per iteration, accumulating into xmm8..xmm11).
        L(l10ac);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 1 of 4: A in xmm0, B in xmm4.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 2 of 4: A in xmm2, B in xmm5.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x50]);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x50]);
        addps(xmm11, xmm6);
        // k-step 3 of 4: A in xmm0, B in xmm4.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x40]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        addps(xmm11, xmm6);
        // k-step 4 of 4: A in xmm2, B in xmm5; pointer updates are interleaved.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x30]);
        add(AA, 0x8);
        sub(BO, -64);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x30]);
        sub(AO, -64);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(l10ac, T_NEAR);
        align(4);

        // K remainder for the 4x4 tile: H = K & 3 single k-steps; none -> store.
        L(l11a4);
        mov(H, K);
        and_(H, 0x3);
        je(l11fc, T_NEAR);
        align(4);

        // One k-step per iteration: A vector in xmm0, B vector in xmm4, products
        // of the four B lane permutations accumulated into xmm8..xmm11.
        // NOTE(review): the movaps copies into xmm7 are not read in this loop or
        // in the epilogue that follows; they look like leftovers from the 8-wide
        // variant -- confirm before removing.
        L(l11b0);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x70]);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x70]);
        addps(xmm11, xmm6);
        // AO += 16 bytes, BO += 16 bytes (sub with a negative immediate).
        sub(AO, -16);
        sub(BO, -16);
        dec(H);
        jg(l11b0, T_NEAR);
        align(4);

        // 4x4 epilogue: the accumulators hold lane-permuted products, so the
        // unpck*/shufps sequence rearranges them into four output vectors
        // (xmm8..xmm11) which are stored at CO1, CO1+LDC, CO2 and CO2+LDC.
        // C is only written, never read.
        L(l11fc);
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movaps(xmm10, xmm1);
        movaps(xmm11, xmm1);
        shufps(xmm10, xmm0, 0xcc);
        shufps(xmm11, xmm0, 0x66);
        movups(xword[CO1 + 0x0], xmm8);
        movups(xword[CO1 + LDC * 1 + 0x0], xmm9);
        movups(xword[CO2], xmm10);
        movups(xword[CO2 + LDC * 1], xmm11);
        // Advance both C pointers by 4 * LDC and repeat while at least 4
        // columns remain in I.
        lea(CO1, ptr[CO1 + LDC * 4 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 4]);
        sub(I, 0x4);
        cmp(I, 0x4);
        jge(lf44, T_NEAR);
        align(4);

        // 4-float (A) x 2-column tile, run only when bit 1 of I is set (`test`
        // clears SF/OF, so jle is taken exactly when (I & 2) == 0).
        // B is loaded with movddup: each load reads 8 bytes (2 floats) and
        // duplicates them into both halves of the register.
        L(l1264);
        test(I, 0x2);
        jle(l1568, T_NEAR);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        movups(xmm0, xword[A - 0x80]);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        movups(xmm2, xword[A - 0x70]);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        movddup(xmm4, qword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movddup(xmm5, qword[BO - 0x78]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(AO, A);
        // H = K / 4 unrolled iterations; none -> go to the K & 3 tail.
        mov(H, K);
        sar(H, 0x2);
        jle(l14cc, T_NEAR);
        // Hold back the last 30 unrolled iterations for the post-prefetch loop.
        sub(H, 0x1e);
        jle(l13c0, T_NEAR);
        align(4);

        // Main 4x2 loop, unrolled over 4 k-steps: AO advances 64 bytes and BO
        // 32 bytes per iteration. Each step multiplies one A vector by lane
        // permutations of a movddup-loaded B pair and accumulates into
        // xmm8..xmm11; steps alternate between (xmm0, xmm4) and (xmm2, xmm5).
        L(l12c4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 1 of 4: A in xmm0, B in xmm4.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 2 of 4: A in xmm2, B in xmm5.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x68]);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x50]);
        addps(xmm11, xmm6);
        // k-step 3 of 4: A in xmm0, B in xmm4.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        addps(xmm11, xmm6);
        // k-step 4 of 4: A in xmm2, B in xmm5; pointer updates are interleaved.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x58]);
        add(AA, 0x8);
        // sub with a negative immediate adds: BO += 32, AO += 64.
        sub(BO, -32);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x30]);
        sub(AO, -64);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(l12c4, T_NEAR);
        align(4);

        // Prefetch the two C locations this tile will store to (CO1, CO1+LDC),
        // then restore the 30 iterations that the setup code subtracted from H.
        L(l13c0);
        prefetcht0(byte[CO1 + 0xc]);
        prefetcht0(byte[CO1 + LDC * 1 + 0xc]);
        add(H, 0x1e);
        align(4);

        // Final unrolled iterations of the 4x2 tile: same body as the main loop
        // at l12c4 (4 k-steps per iteration, AO += 64, BO += 32).
        L(l13d0);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 1 of 4: A in xmm0, B in xmm4.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 2 of 4: A in xmm2, B in xmm5.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x68]);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x50]);
        addps(xmm11, xmm6);
        // k-step 3 of 4: A in xmm0, B in xmm4.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        addps(xmm11, xmm6);
        // k-step 4 of 4: A in xmm2, B in xmm5; pointer updates are interleaved.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x58]);
        add(AA, 0x8);
        sub(BO, -32);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x30]);
        sub(AO, -64);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(l13d0, T_NEAR);
        align(4);

        // K remainder for the 4x2 tile: H = K & 3 single k-steps; none -> store.
        L(l14cc);
        mov(H, K);
        and_(H, 0x3);
        je(l1528, T_NEAR);
        align(4);

        // One k-step per iteration: A vector in xmm0, movddup-loaded B pair in
        // xmm4, accumulating into xmm8..xmm11.
        // NOTE(review): the movaps copies into xmm7 are not read in this loop or
        // in the epilogue that follows -- confirm before removing.
        L(l14d8);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x78]);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x70]);
        addps(xmm11, xmm6);
        // AO += 16 bytes, BO += 8 bytes (sub with a negative immediate).
        sub(AO, -16);
        sub(BO, -8);
        dec(H);
        jg(l14d8, T_NEAR);
        align(4);

        // 4x2 epilogue: rearrange the permuted accumulators and store two
        // 4-float vectors (xmm8 at CO1, xmm9 at CO1+LDC). C is only written.
        // xmm0 and xmm1 are computed but not stored by this block.
        L(l1528);
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movups(xword[CO1 + 0x0], xmm8);
        movups(xword[CO1 + LDC * 1 + 0x0], xmm9);
        // Advance both C pointers by 2 * LDC.
        lea(CO1, ptr[CO1 + LDC * 2 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 2]);
        align(4);

        // 4-float (A) x 1-column tile, run only when bit 0 of I is set (`test`
        // clears SF/OF, so jle is taken exactly when (I & 1) == 0).
        // B is loaded with movss: one float in lane 0, upper lanes zeroed.
        L(l1568);
        test(I, 0x1);
        jle(l1860, T_NEAR);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        movups(xmm0, xword[A - 0x80]);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        movups(xmm2, xword[A - 0x70]);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        movss(xmm4, dword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movss(xmm5, dword[BO - 0x7c]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(AO, A);
        // H = K / 4 unrolled iterations; none -> go to the K & 3 tail.
        mov(H, K);
        sar(H, 0x2);
        jle(l17cc, T_NEAR);
        // Hold back the last 30 unrolled iterations for the post-prefetch loop.
        sub(H, 0x1e);
        jle(l16c4, T_NEAR);
        align(4);

        // Main 4x1 loop, unrolled over 4 k-steps: AO advances 64 bytes and BO
        // 16 bytes per iteration. Each step multiplies one A vector by lane
        // permutations of a movss-loaded B scalar and accumulates into
        // xmm8..xmm11; steps alternate between (xmm0, xmm4) and (xmm2, xmm5).
        L(l15c8);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 1 of 4: A in xmm0, B in xmm4.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x78]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 2 of 4: A in xmm2, B in xmm5.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x74]);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x50]);
        addps(xmm11, xmm6);
        // k-step 3 of 4: A in xmm0, B in xmm4.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        addps(xmm11, xmm6);
        // k-step 4 of 4: A in xmm2, B in xmm5; pointer updates are interleaved.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x6c]);
        add(AA, 0x8);
        // sub with a negative immediate adds: BO += 16, AO += 64.
        sub(BO, -16);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x30]);
        sub(AO, -64);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(l15c8, T_NEAR);
        align(4);

        // Prefetch the single C location this tile will store to, then restore
        // the 30 iterations that the setup code subtracted from H.
        L(l16c4);
        prefetcht0(byte[CO1 + 0xc]);
        add(H, 0x1e);
        align(4);

        // Final unrolled iterations of the 4x1 tile: same body as the main loop
        // at l15c8 (4 k-steps per iteration, AO += 64, BO += 16).
        L(l16d0);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 1 of 4: A in xmm0, B in xmm4.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x78]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 2 of 4: A in xmm2, B in xmm5.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x74]);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x50]);
        addps(xmm11, xmm6);
        // k-step 3 of 4: A in xmm0, B in xmm4.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x40]);
        addps(xmm11, xmm6);
        // k-step 4 of 4: A in xmm2, B in xmm5; pointer updates are interleaved.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x6c]);
        add(AA, 0x8);
        sub(BO, -16);
        mulps(xmm6, xmm2);
        movups(xmm2, xword[AO - 0x30]);
        sub(AO, -64);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(l16d0, T_NEAR);
        align(4);

        // K remainder for the 4x1 tile: H = K & 3 single k-steps; none -> store.
        L(l17cc);
        mov(H, K);
        and_(H, 0x3);
        je(l1828, T_NEAR);
        align(4);

        // One k-step per iteration: A vector in xmm0, movss-loaded B scalar in
        // xmm4, accumulating into xmm8..xmm11.
        // NOTE(review): the movaps copies into xmm7 are not read in this loop or
        // in the epilogue that follows -- confirm before removing.
        L(l17d8);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x7c]);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movups(xmm0, xword[AO - 0x70]);
        addps(xmm11, xmm6);
        // AO += 16 bytes, BO += 4 bytes (sub with a negative immediate).
        sub(AO, -16);
        sub(BO, -4);
        dec(H);
        jg(l17d8, T_NEAR);
        align(4);

        // 4x1 epilogue: gather the four partial accumulators into xmm8 and store
        // one 4-float vector at CO1. C is only written, never read.
        // xmm0, xmm1 and xmm9 are computed but not stored by this block.
        L(l1828);
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movups(xword[CO1 + 0x0], xmm8);
        // Advance both C pointers by one LDC.
        lea(CO1, ptr[CO1 + LDC * 1 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 1]);
        align(4);

        // End of the 4-wide section: A continues from where AO stopped.
        L(l1860);
        mov(A, AO);
        align(4);

        // 2-wide section: executed only when bit 1 of J is set. `test` clears
        // SF/OF, so the jle below is taken exactly when (J & 2) == 0.
        L(l1864);
        test(J, 0x2);
        jle(l21d8, T_NEAR);
        // AA = A + K * 8 bytes; it is only used as a prefetch address in the
        // loops below (presumably the start of the next A panel -- confirm).
        mov(AA, K);
        imul(AA, AA, 0x8);
        add(AA, A);
        // CO1 walks C for this section; C itself moves on by 8 bytes (2 floats).
        mov(CO1, C);
        add(C, 0x8);
        mov(BO, B);
        // I counts the remaining columns of N; fewer than 4 skips the 4-column tile.
        mov(I, N);
        cmp(I, 0x4);
        jl(l1bc0, T_NEAR);
        align(4);

        // Setup for one 2-float (A) x 4-float (B) tile: CO2 = CO1 + 2 * LDC,
        // accumulators cleared, first two A pairs preloaded with movsd (8 bytes
        // into the low half, upper half zeroed) and two B vectors with movaps.
        // Data is addressed with a -0x80 bias (first element at [ptr - 0x80]).
        L(l1894);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        movsd(xmm0, qword[A - 0x80]);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        movsd(xmm2, qword[A - 0x78]);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        movaps(xmm4, xword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movaps(xmm5, xword[BO - 0x70]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(AO, A);
        // H = K / 4 unrolled iterations; none -> go to the K & 3 tail.
        mov(H, K);
        sar(H, 0x2);
        jle(l1afc, T_NEAR);
        // Hold back the last 30 unrolled iterations for the post-prefetch loop.
        sub(H, 0x1e);
        jle(l19e4, T_NEAR);
        align(4);

        // Main 2x4 loop, unrolled over 4 k-steps: AO advances 32 bytes and BO
        // 64 bytes per iteration. Each step multiplies a movsd-loaded A pair by
        // lane permutations of one B vector and accumulates into xmm8..xmm11;
        // steps alternate between (xmm0, xmm4) and (xmm2, xmm5).
        L(l18e8);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 1 of 4: A in xmm0, B in xmm4.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x70]);
        addps(xmm11, xmm6);
        // k-step 2 of 4: A in xmm2, B in xmm5.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x50]);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x68]);
        addps(xmm11, xmm6);
        // k-step 3 of 4: A in xmm0, B in xmm4.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x40]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 4 of 4: A in xmm2, B in xmm5; pointer updates are interleaved.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x30]);
        add(AA, 0x8);
        // sub with a negative immediate adds: BO += 64, AO += 32.
        sub(BO, -64);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x58]);
        sub(AO, -32);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(l18e8, T_NEAR);
        align(4);

        // Prefetch the four C locations this tile will store to (CO1, CO1+LDC,
        // CO2, CO2+LDC; +0x4 is the last float of each 8-byte store), then
        // restore the 30 iterations that the setup code subtracted from H.
        L(l19e4);
        prefetcht0(byte[CO1 + 0x4]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x4]);
        prefetcht0(byte[CO2 + 0x4]);
        prefetcht0(byte[CO2 + LDC * 1 + 0x4]);
        add(H, 0x1e);
        align(4);

        // Final unrolled iterations of the 2x4 tile: same body as the main loop
        // at l18e8 (4 k-steps per iteration, AO += 32, BO += 64).
        L(l1a00);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 1 of 4: A in xmm0, B in xmm4.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x70]);
        addps(xmm11, xmm6);
        // k-step 2 of 4: A in xmm2, B in xmm5.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x50]);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x68]);
        addps(xmm11, xmm6);
        // k-step 3 of 4: A in xmm0, B in xmm4.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x40]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 4 of 4: A in xmm2, B in xmm5; pointer updates are interleaved.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x30]);
        add(AA, 0x8);
        sub(BO, -64);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x58]);
        sub(AO, -32);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(l1a00, T_NEAR);
        align(4);

        // K remainder for the 2x4 tile: H = K & 3 single k-steps; none -> store.
        L(l1afc);
        mov(H, K);
        and_(H, 0x3);
        je(l1b58, T_NEAR);
        align(4);

        // One k-step per iteration: movsd-loaded A pair in xmm0, B vector in
        // xmm4, accumulating into xmm8..xmm11.
        // NOTE(review): the movaps copies into xmm7 are not read in this loop or
        // in the epilogue that follows -- confirm before removing.
        L(l1b08);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x70]);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x78]);
        addps(xmm11, xmm6);
        // AO += 8 bytes, BO += 16 bytes (sub with a negative immediate).
        sub(AO, -8);
        sub(BO, -16);
        dec(H);
        jg(l1b08, T_NEAR);
        align(4);

        // 2x4 epilogue: rearrange the permuted accumulators into xmm8..xmm11 and
        // store the low 8 bytes (2 floats) of each with movlps at CO1, CO1+LDC,
        // CO2 and CO2+LDC. C is only written, never read.
        L(l1b58);
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movaps(xmm10, xmm1);
        movaps(xmm11, xmm1);
        shufps(xmm10, xmm0, 0xcc);
        shufps(xmm11, xmm0, 0x66);
        movlps(qword[CO1 + 0x0], xmm8);
        movlps(qword[CO1 + LDC * 1 + 0x0], xmm9);
        movlps(qword[CO2], xmm10);
        movlps(qword[CO2 + LDC * 1], xmm11);
        // Advance both C pointers by 4 * LDC and repeat while at least 4
        // columns remain in I.
        lea(CO1, ptr[CO1 + LDC * 4 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 4]);
        sub(I, 0x4);
        cmp(I, 0x4);
        jge(l1894, T_NEAR);
        align(4);

        // 2-float (A) x 2-column tile, run only when bit 1 of I is set (`test`
        // clears SF/OF, so jle is taken exactly when (I & 2) == 0).
        // A is loaded with movsd (8 bytes, upper half zeroed) and B with movddup
        // (8 bytes duplicated into both halves).
        L(l1bc0);
        test(I, 0x2);
        jle(l1ed0, T_NEAR);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        movsd(xmm0, qword[A - 0x80]);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        movsd(xmm2, qword[A - 0x78]);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        movddup(xmm4, qword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movddup(xmm5, qword[BO - 0x78]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(AO, A);
        // H = K / 4 unrolled iterations; none -> go to the K & 3 tail.
        mov(H, K);
        sar(H, 0x2);
        jle(l1e34, T_NEAR);
        // Hold back the last 30 unrolled iterations for the post-prefetch loop.
        sub(H, 0x1e);
        jle(l1d24, T_NEAR);
        align(4);

        // Main 2x2 loop, unrolled over 4 k-steps: AO and BO each advance 32
        // bytes per iteration. Each step multiplies a movsd-loaded A pair by
        // lane permutations of a movddup-loaded B pair and accumulates into
        // xmm8..xmm11; steps alternate between (xmm0, xmm4) and (xmm2, xmm5).
        L(l1c24);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 1 of 4: A in xmm0, B in xmm4.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x70]);
        addps(xmm11, xmm6);
        // k-step 2 of 4: A in xmm2, B in xmm5.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x68]);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x68]);
        addps(xmm11, xmm6);
        // k-step 3 of 4: A in xmm0, B in xmm4.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 4 of 4: A in xmm2, B in xmm5; pointer updates are interleaved.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x58]);
        add(AA, 0x8);
        // sub with a negative immediate adds: BO += 32, AO += 32.
        sub(BO, -32);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x58]);
        sub(AO, -32);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(l1c24, T_NEAR);
        align(4);

        // Prefetch the two C locations this tile will store to (CO1, CO1+LDC),
        // then restore the 30 iterations that the setup code subtracted from H.
        L(l1d24);
        prefetcht0(byte[CO1 + 0x4]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x4]);
        add(H, 0x1e);
        align(4);

        // Final unrolled iterations of the 2x2 tile: same body as the main loop
        // at l1c24 (4 k-steps per iteration, AO += 32, BO += 32).
        L(l1d34);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        // k-step 1 of 4: A in xmm0, B in xmm4.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x70]);
        addps(xmm11, xmm6);
        // k-step 2 of 4: A in xmm2, B in xmm5.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x68]);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x68]);
        addps(xmm11, xmm6);
        // k-step 3 of 4: A in xmm0, B in xmm4.
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x60]);
        addps(xmm11, xmm6);
        // k-step 4 of 4: A in xmm2, B in xmm5; pointer updates are interleaved.
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x58]);
        add(AA, 0x8);
        sub(BO, -32);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x58]);
        sub(AO, -32);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(l1d34, T_NEAR);
        align(4);

        // K remainder for the 2x2 tile: H = K & 3 single k-steps; none -> store.
        L(l1e34);
        mov(H, K);
        and_(H, 0x3);
        je(l1e90, T_NEAR);
        align(4);

        // One k-step per iteration: movsd-loaded A pair in xmm0, movddup-loaded
        // B pair in xmm4, accumulating into xmm8..xmm11.
        // NOTE(review): the movaps copies into xmm7 are not read in this loop or
        // in the epilogue that follows -- confirm before removing.
        L(l1e40);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x78]);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x78]);
        addps(xmm11, xmm6);
        // AO += 8 bytes, BO += 8 bytes (sub with a negative immediate).
        sub(AO, -8);
        sub(BO, -8);
        dec(H);
        jg(l1e40, T_NEAR);
        align(4);

        // 2x2 epilogue: rearrange the permuted accumulators and store the low
        // 8 bytes (2 floats) of xmm8 at CO1 and of xmm9 at CO1+LDC with movlps.
        // C is only written. xmm0 and xmm1 are computed but not stored here.
        L(l1e90);
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movlps(qword[CO1 + 0x0], xmm8);
        movlps(qword[CO1 + LDC * 1 + 0x0], xmm9);
        // Advance both C pointers by 2 * LDC.
        lea(CO1, ptr[CO1 + LDC * 2 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 2]);
        align(4);

        // 2-float (A) x 1-column tile, run only when bit 0 of I is set (`test`
        // clears SF/OF, so jle is taken exactly when (I & 1) == 0).
        // A is loaded with movsd (8 bytes, upper half zeroed) and B with movss
        // (one float in lane 0, upper lanes zeroed).
        L(l1ed0);
        test(I, 0x1);
        jle(l21d4, T_NEAR);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        movsd(xmm0, qword[A - 0x80]);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        movsd(xmm2, qword[A - 0x78]);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        movss(xmm4, dword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movss(xmm5, dword[BO - 0x7c]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(AO, A);
        // H = K / 4 unrolled iterations; none -> go to the K & 3 tail.
        mov(H, K);
        sar(H, 0x2);
        jle(l2140, T_NEAR);
        // Hold back the last 30 unrolled iterations for the post-prefetch loop.
        sub(H, 0x1e);
        jle(l2034, T_NEAR);
        align(4);

        L(l1f34);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x78]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x70]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x74]);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x68]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x60]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x6c]);
        add(AA, 0x8);
        sub(BO, -16);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x58]);
        sub(AO, -32);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(l1f34, T_NEAR);
        align(4);

        L(l2034);
        prefetcht0(byte[CO1 + 0x4]);
        add(H, 0x1e);
        align(4);

        L(l2040);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x78]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x70]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x74]);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x68]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x60]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x6c]);
        add(AA, 0x8);
        sub(BO, -16);
        mulps(xmm6, xmm2);
        movsd(xmm2, qword[AO - 0x58]);
        sub(AO, -32);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(l2040, T_NEAR);
        align(4);

        L(l2140);
        mov(H, K);
        and_(H, 0x3);
        je(l219c, T_NEAR);
        align(4);

        L(l214c);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x7c]);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movsd(xmm0, qword[AO - 0x78]);
        addps(xmm11, xmm6);
        sub(AO, -8);
        sub(BO, -4);
        dec(H);
        jg(l214c, T_NEAR);
        align(4);

        L(l219c);
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movlps(qword[CO1 + 0x0], xmm8);
        lea(CO1, ptr[CO1 + LDC * 1 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 1]);
        align(4);

        L(l21d4);
        mov(A, AO);
        align(4);

        L(l21d8);
        test(J, 0x1);
        jle(l2b50, T_NEAR);
        mov(AA, K);
        imul(AA, AA, 0x4);
        add(AA, A);
        mov(CO1, C);
        add(C, 0x4);
        mov(BO, B);
        mov(I, N);
        cmp(I, 0x4);
        jl(l2534, T_NEAR);
        align(4);

        L(l2208);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        movss(xmm0, dword[A - 0x80]);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        movss(xmm2, dword[A - 0x7c]);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        movaps(xmm4, xword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movaps(xmm5, xword[BO - 0x70]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(AO, A);
        mov(H, K);
        sar(H, 0x2);
        jle(l246c, T_NEAR);
        sub(H, 0x1e);
        jle(l2358, T_NEAR);
        align(4);

        L(l225c);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x78]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x50]);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x74]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x40]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x70]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x30]);
        add(AA, 0x8);
        sub(BO, -64);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x6c]);
        sub(AO, -16);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(l225c, T_NEAR);
        align(4);

        L(l2358);
        prefetcht0(byte[CO1 + 0x0]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x0]);
        prefetcht0(byte[CO2]);
        prefetcht0(byte[CO2 + LDC * 1]);
        add(H, 0x1e);
        align(4);

        L(l2370);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x78]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x50]);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x74]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x40]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x70]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movaps(xmm5, xword[BO - 0x30]);
        add(AA, 0x8);
        sub(BO, -64);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x6c]);
        sub(AO, -16);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(l2370, T_NEAR);
        align(4);

        L(l246c);
        mov(H, K);
        and_(H, 0x3);
        je(l24c8, T_NEAR);
        align(4);

        L(l2478);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movaps(xmm4, xword[BO - 0x70]);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x7c]);
        addps(xmm11, xmm6);
        sub(AO, -4);
        sub(BO, -16);
        dec(H);
        jg(l2478, T_NEAR);
        align(4);

        L(l24c8);
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movaps(xmm10, xmm1);
        movaps(xmm11, xmm1);
        shufps(xmm10, xmm0, 0xcc);
        shufps(xmm11, xmm0, 0x66);
        movss(dword[CO1 + 0x0], xmm8);
        movss(dword[CO1 + LDC * 1 + 0x0], xmm9);
        movss(dword[CO2], xmm10);
        movss(dword[CO2 + LDC * 1], xmm11);
        lea(CO1, ptr[CO1 + LDC * 4 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 4]);
        sub(I, 0x4);
        cmp(I, 0x4);
        jge(l2208, T_NEAR);
        align(4);

        L(l2534);
        test(I, 0x2);
        jle(l2844, T_NEAR);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        movss(xmm0, dword[A - 0x80]);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        movss(xmm2, dword[A - 0x7c]);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        movddup(xmm4, qword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movddup(xmm5, qword[BO - 0x78]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(AO, A);
        mov(H, K);
        sar(H, 0x2);
        jle(l27a8, T_NEAR);
        sub(H, 0x1e);
        jle(l2698, T_NEAR);
        align(4);

        L(l2598);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x78]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x68]);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x74]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x70]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x58]);
        add(AA, 0x8);
        sub(BO, -32);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x6c]);
        sub(AO, -16);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(l2598, T_NEAR);
        align(4);

        L(l2698);
        prefetcht0(byte[CO1 + 0x0]);
        prefetcht0(byte[CO1 + LDC * 1 + 0x0]);
        add(H, 0x1e);
        align(4);

        L(l26a8);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x78]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x68]);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x74]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x60]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x70]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movddup(xmm5, qword[BO - 0x58]);
        add(AA, 0x8);
        sub(BO, -32);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x6c]);
        sub(AO, -16);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(l26a8, T_NEAR);
        align(4);

        L(l27a8);
        mov(H, K);
        and_(H, 0x3);
        je(l2804, T_NEAR);
        align(4);

        L(l27b4);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movddup(xmm4, qword[BO - 0x78]);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x7c]);
        addps(xmm11, xmm6);
        sub(AO, -4);
        sub(BO, -8);
        dec(H);
        jg(l27b4, T_NEAR);
        align(4);

        L(l2804);
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movss(dword[CO1 + 0x0], xmm8);
        movss(dword[CO1 + LDC * 1 + 0x0], xmm9);
        lea(CO1, ptr[CO1 + LDC * 2 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 2]);
        align(4);

        L(l2844);
        test(I, 0x1);
        jle(l2b4c, T_NEAR);
        lea(CO2, ptr[CO1 + LDC * 2 + 0x0]);
        movss(xmm0, dword[A - 0x80]);
        xorps(xmm8, xmm8);
        xorps(xmm9, xmm9);
        movss(xmm2, dword[A - 0x7c]);
        xorps(xmm10, xmm10);
        xorps(xmm11, xmm11);
        movss(xmm4, dword[BO - 0x80]);
        xorps(xmm12, xmm12);
        movss(xmm5, dword[BO - 0x7c]);
        xorps(xmm13, xmm13);
        xorps(xmm14, xmm14);
        xorps(xmm15, xmm15);
        mov(AO, A);
        mov(H, K);
        sar(H, 0x2);
        jle(l2ab4, T_NEAR);
        sub(H, 0x1e);
        jle(l29a8, T_NEAR);
        align(4);

        L(l28a8);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x78]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x78]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x74]);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x74]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x70]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x6c]);
        add(AA, 0x8);
        sub(BO, -16);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x6c]);
        sub(AO, -16);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(l28a8, T_NEAR);
        align(4);

        L(l29a8);
        prefetcht0(byte[CO1 + 0x0]);
        add(H, 0x1e);
        align(4);

        L(l29b4);
        prefetcht0(byte[AO + 0x180]);
        prefetcht0(byte[BO + 0x100]);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x78]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x78]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x74]);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x74]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x70]);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x70]);
        addps(xmm11, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm8, xmm5);
        pshufd(xmm5, xmm6, 0x1b);
        mulps(xmm6, xmm2);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm5, 0xb1);
        mulps(xmm5, xmm2);
        addps(xmm10, xmm5);
        movss(xmm5, dword[BO - 0x6c]);
        add(AA, 0x8);
        sub(BO, -16);
        mulps(xmm6, xmm2);
        movss(xmm2, dword[AO - 0x6c]);
        sub(AO, -16);
        addps(xmm11, xmm6);
        prefetcht0(byte[AA - 0x78]);
        sub(H, 0x1);
        jg(l29b4, T_NEAR);
        align(4);

        L(l2ab4);
        mov(H, K);
        and_(H, 0x3);
        je(l2b10, T_NEAR);
        align(4);

        L(l2ac0);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm8, xmm4);
        pshufd(xmm4, xmm6, 0x1b);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        addps(xmm9, xmm6);
        pshufd(xmm6, xmm4, 0xb1);
        movaps(xmm7, xmm4);
        mulps(xmm4, xmm0);
        addps(xmm10, xmm4);
        movss(xmm4, dword[BO - 0x7c]);
        movaps(xmm7, xmm6);
        mulps(xmm6, xmm0);
        movss(xmm0, dword[AO - 0x7c]);
        addps(xmm11, xmm6);
        sub(AO, -4);
        sub(BO, -4);
        dec(H);
        jg(l2ac0, T_NEAR);
        align(4);

        L(l2b10);
        movaps(xmm0, xmm8);
        unpcklpd(xmm8, xmm9);
        unpckhpd(xmm0, xmm9);
        movaps(xmm1, xmm10);
        unpckhpd(xmm10, xmm11);
        unpcklpd(xmm1, xmm11);
        movaps(xmm9, xmm8);
        shufps(xmm8, xmm10, 0xcc);
        shufps(xmm9, xmm10, 0x66);
        movss(dword[CO1 + 0x0], xmm8);
        lea(CO1, ptr[CO1 + LDC * 1 + 0x0]);
        lea(CO2, ptr[CO2 + LDC * 1]);
        align(4);

        L(l2b4c);
        mov(A, AO);
        align(4);

        L(l2b50);

        postamble();
    }
    outLocalLabel();

#undef M
#undef N
#undef K
#undef A
#undef B
#undef C
#undef LDC
#undef AA
#undef I
#undef J
#undef H
#undef AO
#undef BO
#undef CO1
#undef CO2
#ifdef _WIN32
#undef OLD_A
#undef OLD_B
#endif
#undef OLD_C
#undef OLD_LDC
}

} // namespace x64
} // namespace cpu
} // namespace impl
} // namespace dnnl
